# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class TxSpider(CrawlSpider):
    """Crawl spider that walks the job listing pages and scrapes job details.

    Starting from ``start_urls``, the rules below follow the paginated
    listing links and hand every job detail page to :meth:`parse_item`.
    """

    name = 'tx'
    allowed_domains = ['tencent.com']
    start_urls = ['http://hr.tencent.com/position.php']

    rules = (
        # Extract listing-page (pagination) URLs and keep following them.
        # The dot is escaped so it matches only a literal "." in the file name.
        Rule(LinkExtractor(allow=r'position\.php\?&start=\d+#a0'), callback="parse_list", follow=True),
        # Extract job detail-page URLs.
        Rule(LinkExtractor(allow=r'position_detail\.php\?id=\d+&keywords=&tid=0&lid=0'), callback="parse_item"),
    )

    def parse_list(self, response):
        """Handle a listing page.

        Placeholder: intended to extract the publish date, but not
        implemented yet, so it produces no items or requests.
        """
        pass

    def parse_item(self, response):
        """Extract the fields of a single job posting from a detail page.

        Args:
            response: The response for a job detail page.

        Returns:
            dict: ``title`` (first matching text node, or ``None`` when the
            title cell is absent), ``duty`` and ``job_require`` (lists of
            the text of each ``<li>`` under the matching section heading;
            empty when the section is not found).
        """
        item = {}
        item['title'] = response.xpath('//td[@id="sharetitle"]/text()').extract_first()
        item['duty'] = response.xpath('//div[text()="工作职责："]/following-sibling::ul[1]/li/text()').extract()
        item['job_require'] = response.xpath('//div[text()="工作要求："]/following-sibling::ul[1]/li/text()').extract()
        # Return the item so it reaches the item pipelines / feed exports;
        # merely printing it would drop the scraped data.
        return item